In [1]:
import gzip
import json
import pandas as pd

from collections import defaultdict, Counter

In [2]:
%%time
data = []
media_types = defaultdict(int)
url_types = defaultdict(int)
has_urls = 0
unique_urls = set()
with gzip.open("all_ids.txt.json.gz") as fp:
    for line in fp:
        # one tweet per line, stored as a JSON object
        d = json.loads(line.strip())
        data.append(d)
        if 'entities' not in d:
            continue
        # tally media entity types (e.g. photo)
        if 'media' in d['entities']:
            m_entities = d['entities']['media']
            for m in m_entities:
                m_type = m['type']
                media_types[m_type] += 1
        # tally expanded URLs and the domains they point to
        if 'urls' in d['entities']:
            m_entities = d['entities']['urls']
            if len(m_entities) > 0:
                has_urls += 1
            for m in m_entities:
                media_types['url'] += 1
                m = m['expanded_url']
                # crude domain extraction: the host part of http(s)://host/...
                m_type = m.split("/", 3)[2]
                unique_urls.add((m, m_type))
                url_types[m_type] += 1
                
print(media_types)
url_types = Counter(url_types)
print("Of {} tweets, {} contain a total of {} urls with {} unique domains and {} unique urls".format(
        len(data), has_urls, media_types["url"], len(url_types), len(unique_urls)))


defaultdict(<type 'int'>, {'url': 166670, u'photo': 27682})
Of 328318 tweets, 162032 contain a total of 166670 urls with 8750 unique domains and 119558 unique urls
CPU times: user 1min 6s, sys: 4.46 s, total: 1min 11s
Wall time: 1min 11s
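
The "domain" above is simply the host portion of an http(s)://host/... URL taken with split("/", 3)[2], so www.instagram.com and instagram.com count as separate domains. A more robust extraction (a minimal sketch, not part of the original run, written for the same Python 2 environment) would use urlparse and strip a leading "www.", much like the commented-out code in In [20] later does:

from urlparse import urlparse  # Python 2 module, matching this notebook

def url_domain(url):
    # host part of the expanded URL, with any leading "www." removed
    netloc = urlparse(url).netloc
    return netloc[4:] if netloc.startswith("www.") else netloc

print(url_domain("https://www.instagram.com/p/xyz"))  # -> instagram.com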

In [3]:
url_types.most_common(50)


Out[3]:
[(u'twitter.com', 24978),
 (u'bit.ly', 15148),
 (u'fb.me', 15069),
 (u'ow.ly', 6866),
 (u'dlvr.it', 5398),
 (u'ift.tt', 4693),
 (u'goo.gl', 4039),
 (u'ln.is', 3795),
 (u'youtu.be', 3784),
 (u'gvwy.io', 3120),
 (u'www.instagram.com', 2761),
 (u'buff.ly', 2331),
 (u'www.newsweek.com', 1949),
 (u'www.youtube.com', 1170),
 (u'nyti.ms', 1119),
 (u'tinyurl.com', 1083),
 (u'wp.me', 1048),
 (u'm.tbnn.it', 960),
 (u'shar.es', 845),
 (u'www.naturalnews.com', 798),
 (u'warontherocks.com', 739),
 (u'truthinmedia.com', 677),
 (u'cnn.it', 670),
 (u'rover.ebay.com', 604),
 (u'dld.bz', 524),
 (u'www.periscope.tv', 515),
 (u'lnkd.in', 504),
 (u'www.huffingtonpost.com', 486),
 (u'b.autovist.com', 473),
 (u'fxn.ws', 468),
 (u'www.breitbart.com', 461),
 (u'www.facebook.com', 421),
 (u'www.nytimes.com', 416),
 (u'n.pr', 405),
 (u'www.infowars.com', 404),
 (u'a.msn.com', 397),
 (u'thefederalist.com', 385),
 (u'apple.news', 379),
 (u'go.shr.lc', 378),
 (u'NaturalNews.com', 373),
 (u'www.foxnews.com', 362),
 (u'wpo.st', 350),
 (u'pinterest.com', 346),
 (u'www.cnn.com', 325),
 (u'www.yahoo.com', 319),
 (u'amzn.to', 317),
 (u'on.mash.to', 316),
 (u'wapo.st', 314),
 (u'brev.is', 310),
 (u'j.mp', 305)]

In [4]:
sorted(unique_urls,
       key=lambda x: url_types[x[1]],
       reverse=True)[:10]


Out[4]:
[(u'https://twitter.com/i/web/status/787248028335808513', u'twitter.com'),
 (u'https://twitter.com/mr_dsantos/status/792410135582875648', u'twitter.com'),
 (u'https://twitter.com/i/web/status/789400744810024960', u'twitter.com'),
 (u'https://twitter.com/candy_lass/status/692590229069254656', u'twitter.com'),
 (u'https://twitter.com/i/web/status/791387309992280064', u'twitter.com'),
 (u'https://twitter.com/_ijmtybx/status/743864533089947648', u'twitter.com'),
 (u'https://twitter.com/i/web/status/784460833912975360', u'twitter.com'),
 (u'https://twitter.com/i/web/status/792218124707729408', u'twitter.com'),
 (u'https://twitter.com/CaptainCreole/status/798946586730659840',
  u'twitter.com'),
 (u'https://twitter.com/tazerblack/status/786997527560224769', u'twitter.com')]

Run the code below to write all non-twitter.com URLs, sorted by how common their domain is, to all_urls.txt:

with open("all_urls.txt", "wb+") as fp:
    for url in sorted(filter(lambda x: x[1] != 'twitter.com', unique_urls),
                      key=lambda x: url_types[x[1]],
                      reverse=True):
        print >> fp, "%s\t%s\t%s" % (url[0], url[1], url_types[url[1]])

! head all_urls.txt

In [5]:
len(data)


Out[5]:
328318

In [6]:
data[0].keys()


Out[6]:
[u'contributors',
 u'truncated',
 u'text',
 u'is_quote_status',
 u'in_reply_to_status_id',
 u'id',
 u'favorite_count',
 u'source',
 u'quoted_status_id',
 u'retweeted',
 u'coordinates',
 u'quoted_status',
 u'entities',
 u'in_reply_to_screen_name',
 u'id_str',
 u'retweet_count',
 u'in_reply_to_user_id',
 u'favorited',
 u'user',
 u'geo',
 u'in_reply_to_user_id_str',
 u'possibly_sensitive',
 u'lang',
 u'created_at',
 u'quoted_status_id_str',
 u'in_reply_to_status_id_str',
 u'place']

In [7]:
data[0][u'source']


Out[7]:
u'<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'

In [8]:
data[0][u'is_quote_status']


Out[8]:
True

In [9]:
data[0][u'quoted_status']['text']


Out[9]:
u'Overnight apartment fire in Tampa #10News https://t.co/gDBsG8udFg'

In [10]:
data[0]['text']


Out[10]:
u'Getting a better look at the damage now that the sun is up.  Very sad https://t.co/DZrhrubgf9'

In [11]:
count_quoted = 0
has_coordinates = 0
count_replies = 0
language_ids = defaultdict(int)
count_user_locs = 0
user_locs = Counter()
count_verified = 0
for d in data:
    count_quoted += d.get('is_quote_status', 0)
    coords = d.get(u'coordinates', None)
    repl_id = d.get(u'in_reply_to_status_id', None)
    has_coordinates += (coords is not None)
    count_replies += (repl_id is not None)
    loc = d['user'].get('location', u'')
    count_verified += d['user']['verified']
    if loc != u'':
        count_user_locs += 1
        user_locs.update([loc])
    language_ids[d['lang']] += 1
    
print count_quoted, has_coordinates, count_replies, count_user_locs, count_verified
print("Of {} tweets, {} have coordinates, while {} have user locations, comprising of {} unique locations".format(
        len(data), has_coordinates, count_user_locs, len(user_locs)
    ))


21382 646 53296 281811 11366
Of 328318 tweets, 646 have coordinates, while 281811 have user locations, comprising 52421 unique locations

In [12]:
user_locs.most_common(10)


Out[12]:
[(u'United States', 10420),
 (u'USA', 7880),
 (u'Washington, DC', 4310),
 (u'New York, NY', 3082),
 (u'California, USA', 3018),
 (u'Los Angeles, CA', 2719),
 (u'New York', 2312),
 (u'Chicago, IL', 2179),
 (u'New York, USA', 2021),
 (u'Texas', 1773)]

In [13]:
len(data)


Out[13]:
328318

In [14]:
data[0]['user']


Out[14]:
{u'contributors_enabled': False,
 u'created_at': u'Tue Jul 14 00:13:13 +0000 2009',
 u'default_profile': False,
 u'default_profile_image': False,
 u'description': u'Executive Producer at 10News WTSP in Tampa/St. Petersburg. Indiana University graduate.',
 u'entities': {u'description': {u'urls': []}},
 u'favourites_count': 345,
 u'follow_request_sent': False,
 u'followers_count': 573,
 u'following': False,
 u'friends_count': 503,
 u'geo_enabled': True,
 u'has_extended_profile': False,
 u'id': 56544119,
 u'id_str': u'56544119',
 u'is_translation_enabled': False,
 u'is_translator': False,
 u'lang': u'en',
 u'listed_count': 68,
 u'location': u'St. Petersburg',
 u'name': u'Melissa Ramsey',
 u'notifications': False,
 u'profile_background_color': u'0099B9',
 u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme4/bg.gif',
 u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme4/bg.gif',
 u'profile_background_tile': False,
 u'profile_banner_url': u'https://pbs.twimg.com/profile_banners/56544119/1443718335',
 u'profile_image_url': u'http://pbs.twimg.com/profile_images/743866585635491840/Pa-vBAru_normal.jpg',
 u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/743866585635491840/Pa-vBAru_normal.jpg',
 u'profile_link_color': u'0099B9',
 u'profile_sidebar_border_color': u'5ED4DC',
 u'profile_sidebar_fill_color': u'95E8EC',
 u'profile_text_color': u'3C3940',
 u'profile_use_background_image': True,
 u'protected': False,
 u'screen_name': u'mramsey8',
 u'statuses_count': 1010,
 u'time_zone': u'Central Time (US & Canada)',
 u'translator_type': u'none',
 u'url': None,
 u'utc_offset': -21600,
 u'verified': False}

Load expanded data


In [15]:
df = pd.read_csv("URL_CAT_MAPPINGS.txt", sep="\t")
df.head()


Out[15]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com socialmedia
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com commercial
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK

In [16]:
df['URL_EXP_SUCCESS'] = (df.EXPANDED_STATUS < 2)
df.head()


Out[16]:
URL EXPANDED EXPANDED_STATUS URL_DOMAIN URL_CATS URL_EXP_SUCCESS
0 http://www.investmentnews.com/article/20160801... http://www.investmentnews.com/article/20160801... 0 investmentnews.com UNK True
1 http://ow.ly/3avNPe https://www.reddit.com/r/cahideas/comments/42i... 0 reddit.com socialmedia True
2 http://stratcom.kma-assc.com/uncategorized/pre... http://stratcom.kma-assc.com/uncategorized/pre... 3 stratcom.kma-assc.com UNK False
3 http://ln.is/mabelsaveforschool.com/gbEtv http://linkis.com/mabelsaveforschool.com/gbEtv 0 mabelsaveforschool.com commercial True
4 http://kiw.im/16LfJirkfzE https://kiwi.qa/LFHKX8RLIFI7O8/39656070290663927 0 kiwi.qa UNK True

In [17]:
URL_DICT = dict(zip(df[df.URL_CATS != 'UNK'].URL, df[df.URL_CATS != 'UNK'].URL_CATS))
URL_MAPS = dict(zip(df.URL, df.URL_DOMAIN))
URL_EXP_SUCCESS = dict(zip(df.URL, df.URL_EXP_SUCCESS))
len(URL_DICT), df.shape, len(URL_MAPS), len(URL_EXP_SUCCESS)


Out[17]:
(60586, (97512, 6), 97512, 97512)

In [18]:
df.URL.head().values


Out[18]:
array([ 'http://www.investmentnews.com/article/20160801/FREE/160809992/if-history-is-a-guide-market-volatility-is-about-to-spike',
       'http://ow.ly/3avNPe',
       'http://stratcom.kma-assc.com/uncategorized/press-releases-visit-of-republic-of-korea-r-o-k-deputy-national-security-advisor-cho-tae-yong/',
       'http://ln.is/mabelsaveforschool.com/gbEtv',
       'http://kiw.im/16LfJirkfzE'], dtype=object)

In [19]:
URL_MAPS['http://bit.ly/1SqTn5d']


Out[19]:
'examiner.com'

In [20]:
found_urls = 0
twitter_urls = 0
total_urls = 0
tid_mapped_urls = []
url_types = defaultdict(int)
for d in data:
    if 'urls' in d['entities']:
        m_entities = d['entities']['urls']
        for m in m_entities:
            total_urls += 1
            m = m['expanded_url']
            m_cats = "UNK"
            if m in URL_DICT:
                # expanded URL has a known category
                found_urls += 1
                m_cats = URL_DICT[m]
            elif m.startswith("https://twitter.com") or m.startswith("http://twitter.com"):
                found_urls += 1
                twitter_urls += 1
                m_cats = "socialmedia|twitter"
            else:
                # uncategorized: record the expanded domain (or a failure marker)
                m_type = "failed_url"
                if URL_EXP_SUCCESS[m]:
                    m_type = URL_MAPS.get(m, "None.com")
                # earlier approach: extract the domain straight from the URL
                # m_type = m.split("/", 3)[2]
                # if m_type.startswith("www."):
                #     m_type = m_type[4:]
                url_types[m_type] += 1
            tid_mapped_urls.append((d["id"], m, m_cats))
print "Data: %s, Total: %s, Found: %s, Twitter: %s" % (len(data), total_urls, found_urls, twitter_urls)
url_types = Counter(url_types)
url_types.most_common(10)


Data: 328318, Total: 166670, Found: 118384, Twitter: 24978
Out[20]:
[('failed_url', 1749),
 ('informationexclusives.com', 170),
 ('toonradio.net', 168),
 ('conservativereport.org', 133),
 ('tallwellnutrition.com', 130),
 ('greenmedinfo.com', 122),
 ('sherif.ws', 105),
 ('americas-fs.org', 104),
 ('mediaite.com', 90),
 ('conservativereview.com', 89)]

In [21]:
url_types.most_common(50)


Out[21]:
[('failed_url', 1749),
 ('informationexclusives.com', 170),
 ('toonradio.net', 168),
 ('conservativereport.org', 133),
 ('tallwellnutrition.com', 130),
 ('greenmedinfo.com', 122),
 ('sherif.ws', 105),
 ('americas-fs.org', 104),
 ('mediaite.com', 90),
 ('conservativereview.com', 89),
 ('thinkprogress.org', 88),
 ('massageenvy.com', 85),
 ('indiewire.com', 83),
 ('webogi.com', 80),
 ('com', 78),
 ('amp.twimg.com', 73),
 ('csoonline.com', 72),
 ('infantway.com', 69),
 ('hotair.com', 69),
 ('alertseditor.com', 68),
 ('teaparty.org', 68),
 ('a.bla.es', 67),
 ('ww1.news-freak.com', 66),
 ('vaccines.news', 65),
 ('reason.com', 65),
 ('smartbrief.com', 65),
 ('talknetwork.com', 64),
 ('newslocker.com', 64),
 ('empleoya.es', 63),
 ('healthlogics.press', 63),
 ('30dayfortune.com', 63),
 ('reverbnation.com', 62),
 ('trap.it', 61),
 ('theconversation.com', 60),
 ('rietta.com', 59),
 ('prnewswire.com', 59),
 ('choiceandtruth.com', 59),
 ('healthy-holistic-living.com', 58),
 ('blogs.wsj.com', 58),
 ('disq.us', 57),
 ('sun-sentinel.com', 57),
 ('wakingtimes.com', 56),
 ('myeclinik.com', 56),
 ('blog.tenthamendmentcenter.com', 56),
 ('usalovelist.com', 56),
 ('therealnews.com', 56),
 ('snopes.com', 56),
 ('theusualroutine.com', 55),
 ('finance.yahoo.com', 55),
 ('organiclifestylemagazine.com', 55)]

In [22]:
sum(url_types.values())


Out[22]:
48286
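
This is exactly the remainder of URLs from In [20] that were neither category-mapped nor twitter.com links (a quick check, not a cell from the original run):

print(total_urls - found_urls)  # 166670 - 118384 == 48286 == sum(url_types.values())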

In [23]:
tid_mapped_urls[:10]


Out[23]:
[(682904901916225536,
  u'https://twitter.com/photogchad_WTSP/status/682903997288681472',
  'socialmedia|twitter'),
 (682915876316692480, u'http://www.investirdanslenfance.ca/', 'UNK'),
 (682985833821941760, u'http://TinyURL.com/NewYearCure', 'commercial'),
 (682952771746664448, u'http://TinyURL.com/NewYearCure', 'commercial'),
 (682830450969059328,
  u'http://yournewswire.com/donald-trump-vaccines-cause-autism/',
  'fakenews'),
 (682998926157418496, u'http://go.shr.lc/1Cq6myS', 'videos'),
 (682924083126767616,
  u'http://www.walesonline.co.uk/news/education/we-improve-education-minister-huw-10630807#ICID=ios_WalesOnlineNewsApp_AppShare_Click_Other',
  'news'),
 (682929300241133569,
  u'https://www.youreducationguides.com/single-parent-involvement/',
  'UNK'),
 (682949413543739392,
  u'http://www.globalresearch.ca/measles-vaccines-kill-more-people-than-measles-cdc-data-proves/5429736',
  'clickbait|fakenews'),
 (682946899259723779, u'http://goo.gl/no4E42', 'commercial')]

In [24]:
df_mapped_cats = pd.DataFrame(tid_mapped_urls, columns=["TID", "URL", "CATS"])
df_mapped_cats.head()


Out[24]:
TID URL CATS
0 682904901916225536 https://twitter.com/photogchad_WTSP/status/682... socialmedia|twitter
1 682915876316692480 http://www.investirdanslenfance.ca/ UNK
2 682985833821941760 http://TinyURL.com/NewYearCure commercial
3 682952771746664448 http://TinyURL.com/NewYearCure commercial
4 682830450969059328 http://yournewswire.com/donald-trump-vaccines-... fakenews

In [25]:
df_mapped_cats.to_csv("TID_URL_CATS.txt", sep="\t", index=False)
! head TID_URL_CATS.txt


TID	URL	CATS
682904901916225536	https://twitter.com/photogchad_WTSP/status/682903997288681472	socialmedia|twitter
682915876316692480	http://www.investirdanslenfance.ca/	UNK
682985833821941760	http://TinyURL.com/NewYearCure	commercial
682952771746664448	http://TinyURL.com/NewYearCure	commercial
682830450969059328	http://yournewswire.com/donald-trump-vaccines-cause-autism/	fakenews
682998926157418496	http://go.shr.lc/1Cq6myS	videos
682924083126767616	http://www.walesonline.co.uk/news/education/we-improve-education-minister-huw-10630807#ICID=ios_WalesOnlineNewsApp_AppShare_Click_Other	news
682929300241133569	https://www.youreducationguides.com/single-parent-involvement/	UNK
682949413543739392	http://www.globalresearch.ca/measles-vaccines-kill-more-people-than-measles-cdc-data-proves/5429736	clickbait|fakenews
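
As a quick follow-up (a sketch that was not part of the original run, reusing the file written above), the per-tweet category file can be reloaded and the pipe-separated CATS labels tallied individually:

df_cats = pd.read_csv("TID_URL_CATS.txt", sep="\t")
cat_counts = Counter()
for cats in df_cats.CATS:
    # multi-label cells such as "clickbait|fakenews" split into individual labels
    cat_counts.update(cats.split("|"))
print(cat_counts.most_common(10))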

Extract tweet features


In [26]:
def extract_meta_features(x):
    """Flatten one tweet JSON object into a tuple of tweet- and user-level features."""
    u_data = x["user"]
    u_url = u_data['url']
    if u_url is not None:
        # use the expanded form of the URL on the user's profile
        u_url = u_data['entities']['url']['urls'][0]['expanded_url']
    return (x["id"],
            # tweet-level features
            x['created_at'],
            x['retweet_count'],
            x['favorite_count'],
            x['in_reply_to_status_id'] is not None,
            'quoted_status' in x and x['quoted_status'] is not None,
            len(x['entities']['hashtags']),
            len(x['entities']['urls']),
            len(x['entities']['user_mentions']),
            0 if 'media' not in x['entities'] else len(x['entities']['media']), # Has photos
            # user-level features
            u_data['id'],
            u_data[u'created_at'],
            u_data[u'listed_count'],
            u_data[u'favourites_count'],
            u_data[u'followers_count'],
            u_data[u'friends_count'],
            u_data[u'statuses_count'],
            u_data[u'verified'],
            u_data[u'location'].replace('\r', ''),  # strip stray carriage returns from free-text fields
            u_data[u'name'].replace('\r', ''),
            u_url
           )

In [27]:
extract_meta_features(data[0])


Out[27]:
(682904901916225536,
 u'Fri Jan 01 12:43:11 +0000 2016',
 0,
 0,
 False,
 True,
 0,
 1,
 0,
 0,
 56544119,
 u'Tue Jul 14 00:13:13 +0000 2009',
 68,
 345,
 573,
 503,
 1010,
 False,
 u'St. Petersburg',
 u'Melissa Ramsey',
 None)

In [28]:
df_meta = pd.DataFrame((extract_meta_features(d) for d in data),
                      columns=["t_id", "t_created", "t_retweets",
                              "t_favorites", "t_is_reply", "t_is_quote",
                              "t_n_hashtags", "t_n_urls", "t_n_mentions",
                              "t_n_media",
                               "u_id", "u_created",
                               "u_n_listed", "u_n_favorites", "u_n_followers",
                               "u_n_friends", "u_n_statuses",
                               "u_is_verified", "u_location", "u_name", "u_url"
                              ])
df_meta.head()


Out[28]:
t_id t_created t_retweets t_favorites t_is_reply t_is_quote t_n_hashtags t_n_urls t_n_mentions t_n_media ... u_created u_n_listed u_n_favorites u_n_followers u_n_friends u_n_statuses u_is_verified u_location u_name u_url
0 682904901916225536 Fri Jan 01 12:43:11 +0000 2016 0 0 False True 0 1 0 0 ... Tue Jul 14 00:13:13 +0000 2009 68 345 573 503 1010 False St. Petersburg Melissa Ramsey None
1 682915876316692480 Fri Jan 01 13:26:47 +0000 2016 100 7 False False 0 1 0 0 ... Fri Mar 11 07:55:47 +0000 2011 48 123 24864 6101 3594 False United States Bree Victorie None
2 682985833821941760 Fri Jan 01 18:04:46 +0000 2016 2 0 True False 2 1 1 0 ... Sun Oct 19 18:44:28 +0000 2008 1151 8709 20635 22698 207314 False USA TannersDad Tim http://www.AgeofAutism.com
3 682952771746664448 Fri Jan 01 15:53:24 +0000 2016 1 0 False False 4 1 0 0 ... Sun Oct 19 18:44:28 +0000 2008 1151 8709 20635 22698 207314 False USA TannersDad Tim http://www.AgeofAutism.com
4 682843745520238592 Fri Jan 01 08:40:10 +0000 2016 0 0 False False 0 0 0 0 ... Tue Apr 13 09:22:10 +0000 2010 6 4 255 83 37291 False All Over Los Angeles Duke None

5 rows × 21 columns


In [29]:
df_meta.dtypes


Out[29]:
t_id              int64
t_created        object
t_retweets        int64
t_favorites       int64
t_is_reply         bool
t_is_quote         bool
t_n_hashtags      int64
t_n_urls          int64
t_n_mentions      int64
t_n_media         int64
u_id              int64
u_created        object
u_n_listed        int64
u_n_favorites     int64
u_n_followers     int64
u_n_friends       int64
u_n_statuses      int64
u_is_verified      bool
u_location       object
u_name           object
u_url            object
dtype: object
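
t_created and u_created are kept as plain strings in Twitter's created_at format (e.g. "Fri Jan 01 12:43:11 +0000 2016"). If timestamps are needed, a conversion along these lines should work (a sketch, not run in the original notebook; the *_ts names are new local Series, so df_meta and the file written below stay unchanged):

# parse Twitter's created_at strings into pandas timestamps (the "+0000" offset is matched literally)
t_created_ts = pd.to_datetime(df_meta['t_created'],
                              format='%a %b %d %H:%M:%S +0000 %Y')
u_created_ts = pd.to_datetime(df_meta['u_created'],
                              format='%a %b %d %H:%M:%S +0000 %Y')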

In [30]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].head()


Out[30]:
2     http://www.AgeofAutism.com
3     http://www.AgeofAutism.com
5    http://theskepticsguide.org
6    http://google.com/+HelenKap
8         http://www.ralajoy.com
Name: u_url, dtype: object

In [31]:
df_meta.to_csv("TID_META.txt", sep="\t", index=False, encoding='utf-8')
! head TID_META.txt


t_id	t_created	t_retweets	t_favorites	t_is_reply	t_is_quote	t_n_hashtags	t_n_urls	t_n_mentions	t_n_media	u_id	u_created	u_n_listed	u_n_favorites	u_n_followers	u_n_friends	u_n_statuses	u_is_verified	u_location	u_name	u_url
682904901916225536	Fri Jan 01 12:43:11 +0000 2016	0	0	False	True	0	1	0	0	56544119	Tue Jul 14 00:13:13 +0000 2009	68	345	573	503	1010	False	St. Petersburg	Melissa Ramsey	
682915876316692480	Fri Jan 01 13:26:47 +0000 2016	100	7	False	False	0	1	0	0	264062848	Fri Mar 11 07:55:47 +0000 2011	48	123	24864	6101	3594	False	United States	Bree Victorie	
682985833821941760	Fri Jan 01 18:04:46 +0000 2016	2	0	True	False	2	1	1	0	16854311	Sun Oct 19 18:44:28 +0000 2008	1151	8709	20635	22698	207314	False	USA	TannersDad Tim	http://www.AgeofAutism.com
682952771746664448	Fri Jan 01 15:53:24 +0000 2016	1	0	False	False	4	1	0	0	16854311	Sun Oct 19 18:44:28 +0000 2008	1151	8709	20635	22698	207314	False	USA	TannersDad Tim	http://www.AgeofAutism.com
682843745520238592	Fri Jan 01 08:40:10 +0000 2016	0	0	False	False	0	0	0	0	132465703	Tue Apr 13 09:22:10 +0000 2010	6	4	255	83	37291	False	All Over Los Angeles	Duke	
682832248626876416	Fri Jan 01 07:54:29 +0000 2016	0	0	True	False	0	0	1	0	160073590	Sun Jun 27 03:34:29 +0000 2010	5	41	188	211	28610	False	860	Connor Durden	http://theskepticsguide.org
682830450969059328	Fri Jan 01 07:47:20 +0000 2016	0	0	False	False	1	1	1	0	1216115161	Sun Feb 24 17:39:27 +0000 2013	12	624	428	821	6287	False	Seattle	Helen Kap	http://google.com/+HelenKap
682895562534862848	Fri Jan 01 12:06:04 +0000 2016	0	0	False	False	0	0	0	0	2339295488	Tue Feb 11 22:08:57 +0000 2014	124	1	349	57	1085424	False		Ranier Gray	
682973867627941889	Fri Jan 01 17:17:13 +0000 2016	0	0	False	False	1	0	0	0	1604405960	Thu Jul 18 21:40:24 +0000 2013	21	119	370	708	1558	False		rala brubaker	http://www.ralajoy.com

In [32]:
df_meta[df_meta.u_url.apply(lambda x: x is not None)]["u_url"].shape


Out[32]:
(170675,)

In [33]:
df_meta.shape


Out[33]:
(328318, 21)

In [ ]: